The structure of Orphanet Rare Disease Ontology was queried and processed using R interface of the Ontology Lookup Service (https://lgatto.github.io/rols/index.html). A number of calculation per-computed for further analyses on this section was performed in source/Orphanet_annotate_genes_to_ancestors.R.
# load direct gene association
orpha_gene_onset_df <- readRDS("../cache/orpha_gene_onset_df.RDS")
# disease gene association with roots
orphanet_gene_association <- read_tsv("../data/orphaNet_disease_gene_association_with_roots.tsv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## orphaID = col_character(),
## label = col_character(),
## genes = col_character(),
## n_genes = col_double(),
## description = col_character(),
## roots = col_character()
## )
# disease gene association at group level
source("../functions/readdata_functions.R")
gene_disease_orpha = process_disease_genes_data("../data/table_disease_gene_assoc_orphanet_genetic.tsv", 1, 2000)
## reading disease gene association data
## [1] "read 28 diseases, of total 3593 associated genes."
#source("../source/read_orphanet_gene_association_data.R")
gene_disease_orpha = gene_disease_orpha$disgene_df
# Modify and merge data
orpha_gene_display_df <- orphanet_gene_association %>%
dplyr::filter(n_genes > 0) %>%
mutate(ID = as.double(str_remove(orphaID, "Orphanet:")))
DT::datatable(orpha_gene_display_df[,c("ID", "label", "n_genes", "genes")] ,
extensions = 'Buttons',
options = list(dom = 'Blfrtip',
buttons = c('copy', 'csv', 'excel', 'pdf', 'print'),
lengthMenu = list(c(10,25,50,-1),
c(10,25,50,"All"))))
Rare diseases are scarcely annotated, and most disease terms (2686 out of 3771) are only associated with one gene. Network-based measurements for individual diseases are unfeasible and grouping of the terms for higher level association are necessary.
# from orphanet_mapping_top_branch
gene_per_disease = orpha_gene_onset_df %>%
dplyr::filter(!is.na(gene)) %>%
count(orphaID)
gene_per_disease_count = gene_per_disease %>%
mutate(group = cut(n, breaks = c(0:10, 100), labels = c(1:10, "> 10"))) %>%
count(group)
p = ggplot(gene_per_disease_count, aes(x = group, y =n)) + geom_col() +
ggtitle("Most orphanet diseases are immediately associated with one gene") +
theme_minimal() + ylab("Number of diseases") + xlab("Number of genes per disease")#+ scale_y_log10()
plotly::ggplotly(p)
pacman::p_load(cowplot)
#gene_per_disease_group = gene_disease_orpha %>%
# mutate(group = cut(n_genes, breaks = c(seq(0,100,20), seq(200,1000, 200), Inf))) %>% count(group)
#ggplot(gene_per_disease_group, aes(x = group, y =n)) + geom_col() +
# ggtitle("Most orphanet diseases are immediately associated with one gene") +
# theme_minimal() + ylab("Number of diseases") + xlab("Number of genes per disease")#+ scale_y_log10()
# number shift
gene_per_disease_group = gene_disease_orpha %>%
mutate(disease = "Grouped", n = N) %>%
select(disease, n)
gene_per_disease = gene_per_disease %>%
mutate(disease = "Individual")
gene_per_disease_both = gene_per_disease %>%
select(disease, n) %>%
bind_rows(., gene_per_disease_group) %>%
dplyr::filter(n>0) %>%
# relevel factor
mutate(disease = factor(disease, levels = c("Individual","Grouped")))
# add break values for plotting on log scale on x axis
breakvals = c(0:9, seq(10,90,10), seq(100,1000,100), 2000)
gene_per_disease_both_count = gene_per_disease_both %>%
mutate(group = cut(n, breaks = breakvals, include.lowest = T, labels = breakvals[-1])) %>%
group_by(disease, group) %>%
summarise(n = n()) %>%
mutate(prop = n/sum(n)) %>%
full_join(., tibble(group = as_factor(breakvals[-1])))
## `summarise()` has grouped output by 'disease'. You can override using the `.groups` argument.
## Joining, by = "group"
gene_per_disease_both_count$disease[is.na(gene_per_disease_both_count$disease)] = "Grouped"
gene_per_disease_both_count$n[is.na(gene_per_disease_both_count$n)] = 0
# plot
#ggplot(gene_per_disease_both_count, aes(y = prop, x = group)) + geom_col() + theme_minimal() +facet_grid(disease ~ .)
p = ggplot(gene_per_disease_both_count, aes(y = n, x = group)) +
geom_col() +
facet_grid(disease ~ ., scales = "free") +
#scale_x_discrete(guide = guide_axis(check.overlap = TRUE)) +
scale_x_discrete(breaks = c(1,10,100,1000))+
geom_vline(xintercept = which(levels(gene_per_disease_both_count$group)==20), linetype = "dashed", col = "red") + theme_cowplot() + xlab("Genes per disease term") + ylab("Number of terms")
p
#ggsave("../Figs/orphanet_individual_vs_grouped_diseases_bar.pdf", p, width = 3*1.5, height = 2*1.5)
Based on the plot above, accumulating gene-disease association for descendant disease terms of ‘Rare genetic disease’ ( “Orphanet:98053”) resulting in physiologically distinct disease groups where the majority (26 out of 28) groups are associated with sufficient amount of genes for module detection (\(n=20\)).
The disease gene association can be found in data/table_disease_gene_assoc_orphanet_genetic.tsv. The summary of disease groups and all associated genes are shown below.
rmarkdown::paged_table(gene_disease_orpha %>% select(name, N) %>% arrange(-N))
Omitting disease groups with fewer than 20 associated genes, the number of terms (out of 3771) and genes excluded from further analyses include:
# roots that are removed
removed_roots <- gene_disease_orpha %>% filter(N<20) %>% pull(name)
# table for removed terms
removed_disease_terms <- orphanet_gene_association %>%
mutate(n_roots = str_count(roots, ";")+1,
removed_roots = grepl(removed_roots[1], roots) + grepl(removed_roots[2], roots),
remained_roots = n_roots - removed_roots) %>%
filter(remained_roots == 0) %>%
select(orphaID, label, genes, roots)
removed_genes <- sapply(removed_disease_terms$genes, function(x) str_split(x, ";")) %>% unlist %>% unique()
knitr::kable(removed_disease_terms)
| orphaID | label | genes | roots |
|---|---|---|---|
| Orphanet:199241 | Pulmonary capillary hemangiomatosis | EIF2AK4 | Rare genetic respiratory disease |
| Orphanet:210122 | Congenital alveolar capillary dysplasia | FOXF1 | Rare genetic respiratory disease |
| Orphanet:217566 | Chronic respiratory distress with surfactant metabolism deficiency | SFTPC | Rare genetic respiratory disease |
| Orphanet:217563 | Neonatal acute respiratory distress due to SP-B deficiency | SFTPB | Rare genetic respiratory disease |
| Orphanet:440402 | Interstitial lung disease due to ABCA3 deficiency | ABCA3 | Rare genetic respiratory disease |
| Orphanet:440392 | Interstitial lung disease due to SP-C deficiency | SFTPC | Rare genetic respiratory disease |
| Orphanet:444092 | Autoimmune interstitial lung disease-arthritis syndrome | COPA | Rare genetic respiratory disease |
| Orphanet:31837 | Pulmonary venoocclusive disease | BMPR2;EIF2AK4 | Rare genetic respiratory disease |
| Orphanet:100051 | Hereditary angioedema type 2 | SERPING1 | Serpinopathy |
| Orphanet:100050 | Hereditary angioedema type 1 | SERPING1 | Serpinopathy |
There are 10 disease terms whose associated genes are not associated with any other disease groups, and hence these 8 genes are omitted.
# download the association
library(RColorBrewer)
orphanet_gene_association_unique_root <- orphanet_gene_association %>%
separate_rows(., roots, sep = ";", convert = T) %>% dplyr::filter(!is.na(roots)) %>%
mutate(roots = as.factor(roots))
## take a function to allow modifying alpha values
# Add an alpha value to a colour
add.alpha <- function(col=NULL, alpha=1){
if(missing(col))
stop("Please provide a vector of colours.")
apply(sapply(col, col2rgb)/255, 2,
function(x)
rgb(x[1], x[2], x[3], alpha=alpha))
}
# define colours for all disease groups
mycolors <- colorRampPalette(brewer.pal(8, "Set2"))(nrow(gene_disease_orpha))
# add id 1-28
gene_disease_orpha$id = 1:nrow(gene_disease_orpha)
# add colour with corresponding alpha values to each disease group
gene_disease_orpha_mod <- gene_disease_orpha %>%
rowwise() %>%
mutate(col =mycolors[id])
#mutate(col = add.alpha(mycolors[id], N/max(gene_disease_orpha$N)))
gene_disease_orpha_mod$root = "Orphanet"
# load voronoi treemap package
if(!"voronoiTreemap" %in% rownames(installed.packages())){
pacman::p_load_gh("https://github.com/uRosConf/voronoiTreemap")
} else{
library(voronoiTreemap)
}
gene_disease_orpha_mod$plotlab = str_remove_all( gene_disease_orpha_mod$name, "Rare |genetic | disease| syndrome| disorder")
onto_json <- vt_export_json(vt_input_from_df(gene_disease_orpha_mod, hierachyVar0 = "root", hierachyVar1 = "name", hierachyVar2 = "name", colorVar = "col", weightVar = "N", labelVar = "plotlab"))
vt_d3(onto_json)